The various methods and algorithms of machine learning are explored and tested with the purpose to regionalize specific capacity for the study area of the Lake Chad Basin. Specific capacity was obtained from puming tests at 1387 locations.Various geoscientific parameters are used and tested for training and building the models, such as elevation, river density, lithology, etc. But before getting started, some presettings are necessary to simplify coding and reproducability.

1 Environment Presettings

1.1 Set working directory

Set the working directory relative to the folder containing the source script.

setwd(substr(
  rstudioapi::getActiveDocumentContext()$path,
  1,
  rev(gregexpr(
    "/", rstudioapi::getActiveDocumentContext()$path
  )[[1]])[2]
))

1.2 External Functions

Often needed parts of the scipt were outsourced to external functions. This simplifies and shortens the code for a more comprehensible reading. The following lines source these functions from the folder called functions with a relative path to the active documents path.

purrr::map(
  list.files(
    path = './scripts/functions',
    pattern = "\\.R$",
    full.names = TRUE,
    recursive = TRUE
  ),
  source
)

1.3 Required Packages

Install and load all packages within the brackets of the pacman::p_load() function. Note: Load the tidyverse package last to prevent functions to be masked by other packages.

if (!require("pacman"))
  install.packages("pacman")
pacman::p_load(
  readxl,
  janitor,
  ggplot2,
  kable,
  kableExtra,
  png,
  FSA,
  sp,
  strict,
  glue,
  leaflet,
  leaflet.extras,
  htmlwidgets,
  htmltools,
  plotly,
  RANN,
  plyr,
  dplyr,
  spdplyr,
  rgeos,
  rgdal,
  raster,
  tidyverse
)

2 Data Import

2.1 Stammdaten

stammdaten <- readxl::read_excel('./raw_data/Datensatz/Stammdaten_D.xlsx') %>% 
  as_tibble() %>% 
  clean_names()

names(stammdaten) <- c('obswell_id', 'name', 'lon', 'lat')

2.2 Corine Land Cover 2018 (CLC2018)

CLC2018 data for whole Europe was downloaded from: https://land.copernicus.eu/pan-european/corine-land-cover/clc2018?tab=download

clc2018_data <-
  readOGR('./processed_data/clc_as_shp/corine_de.shp') %>%
  clean_names() %>%
  dplyr::mutate(code_18 = as.integer(as.character(code_18)))
## OGR data source with driver: ESRI Shapefile 
## Source: "C:\Noelscher.M\Projects\13_spatial_analysis_12Kobswell\r\processed_data\clc_as_shp\corine_de.shp", layer: "corine_de"
## with 233551 features
## It has 6 fields

CLC2018 Legend

clc2018_legend <- read_excel('./raw_data/CLC2018/Legend/clc_legend.xls') %>% 
  clean_names()

2.3 Hydrogeological Units

2.3.1 First Time

Load hydrogeological unit data

# hydr_unit <- readOGR('./raw_data/hyraum_v32/hyraum_gr__v32_poly.shp', encoding = "UTF-8/LATIN-1/...") %>% 
#   clean_names() %>% spTransform(CRS('+proj=longlat +ellps=WGS84 +datum=WGS84 +no_defs'))
# 
# hydr_unit <- hydr_unit %>% 
#   dplyr::mutate(gr_name = str_replace_all(gr_name, c("ü" = "ue", "ö" = "oe", "ä" = "ae", "ß" = "sz"))) %>% 
#   dplyr::mutate(gr_nr_name = str_replace_all(gr_nr_name, c("ü" = "ue", "ö" = "oe", "ä" = "ae", "ß" = "sz")))
# 
# hydr_unit %>% writeOGR('./processed_data/hydr_unit_umlaute_rm.shp', layer = 'hydr_unit_umlaute_rm', driver="ESRI Shapefile")

2.3.2 Second Time

Load hydrogeological unit data

hydr_unit <- readOGR('./processed_data/hydr_unit_umlaute_rm.shp') %>% 
  clean_names() %>% spTransform(CRS('+proj=longlat +ellps=WGS84 +datum=WGS84 +no_defs'))
## OGR data source with driver: ESRI Shapefile 
## Source: "C:\Noelscher.M\Projects\13_spatial_analysis_12Kobswell\r\processed_data\hydr_unit_umlaute_rm.shp", layer: "hydr_unit_umlaute_rm"
## with 11 features
## It has 5 fields

2.4 Water production Sites

Data on groundwater pumping stations was obtained from the HAD Thema 7.2 provided by Bundesanstalt für Gewässerkunde (BfG) Source: „Hydrologischer Atlas von Deutschland/BfG, 2000“

Import

public_water_prod_sp <- readOGR('./raw_data/had/off_wassergewinnung.shp')
## OGR data source with driver: ESRI Shapefile 
## Source: "C:\Noelscher.M\Projects\13_spatial_analysis_12Kobswell\r\raw_data\had\off_wassergewinnung.shp", layer: "off_wassergewinnung"
## with 1230 features
## It has 33 fields
## Integer64 fields read as strings:  TK X_LAMBERT Y_LAMBERT
mining_water_prod_sp <- readOGR('./raw_data/had/bergbau_wassergewinnung.shp')
## OGR data source with driver: ESRI Shapefile 
## Source: "C:\Noelscher.M\Projects\13_spatial_analysis_12Kobswell\r\raw_data\had\bergbau_wassergewinnung.shp", layer: "bergbau_wassergewinnung"
## with 379 features
## It has 30 fields
## Integer64 fields read as strings:  TK X_LAMBERT Y_LAMBERT
powerplant_water_prod_sp <- readOGR('./raw_data/had/kraftwerke.shp')
## OGR data source with driver: ESRI Shapefile 
## Source: "C:\Noelscher.M\Projects\13_spatial_analysis_12Kobswell\r\raw_data\had\kraftwerke.shp", layer: "kraftwerke"
## with 62 features
## It has 31 fields
## Integer64 fields read as strings:  TK X_LAMBERT Y_LAMBERT
reservoir_sp <- readOGR('./raw_data/had/talsperre.shp')
## OGR data source with driver: ESRI Shapefile 
## Source: "C:\Noelscher.M\Projects\13_spatial_analysis_12Kobswell\r\raw_data\had\talsperre.shp", layer: "talsperre"
## with 64 features
## It has 47 fields
## Integer64 fields read as strings:  LFD_NR R_WERT H_WERT XLAMBERT YLAMBERT NR_HAD

3 Data Preparation

3.1 Start date of time series

3.1.1 First time

First we list all files in the directory

# files_list <- list.files(path = "./raw_data/Datensatz/Messstellen preprocessed/", pattern = "*.txt")

Then we apply a custom function to read in all files, bind them and extract the lines with first observation grouped by the wells This implementation to find the start date is very slow, next time it would be worth trying another method.

# start_date <- files_list %>% extract_start_date()

Now we ungroup the dataframe

# start_date <- start_date %>% ungroup()

# names(start_date) <- c(names(stammdaten)[1], 'name', 'start_date', 'value')

As the previous steps took a few ours, we store the result as .csv

# start_date %>% write_csv2(path = './processed_data/start_date.csv')

3.1.2 Second time continue here…

Read in start_date.csv

start_date <- read_csv2('./processed_data/start_date.csv')

3.2 Data Wrangling

Join the 2 dataframes by their common obswell_id

stammdaten_join_startdate <- stammdaten %>%
  dplyr::select(-name) %>% 
  left_join(start_date, by = 'obswell_id')

Show example entries

stammdaten_join_startdate %>% headtail()
##          obswell_id    lon     lat                        name start_date
## 1       BB_25470023 808527 5928687               Ottenhagen OP 2000-11-20
## 2       BB_25470024 808530 5928686               Ottenhagen UP 2000-12-18
## 3       BB_25480025 820407 5933409                 Neumannshof 2000-11-20
## 13490 TH_5633900114 660999 5581364       Hy Heinersdorf 1_2004 2007-01-08
## 13491 TH_5729240555 616545 5566272 Hy Schweickershausen 1_2002 2007-01-08
## 13492 TH_5730000077 625563 5568125          Br.Lindenau (0006) 2007-01-08
##         value
## 1       78.96
## 2       78.78
## 3       34.93
## 13490 429.972
## 13491   348.9
## 13492  281.26

Now we separate the coordinates from the dataframe

coords <- stammdaten_join_startdate %>% 
  dplyr::select(lon, lat)

Show example entries

coords %>% headtail()
##          lon     lat
## 1     808527 5928687
## 2     808530 5928686
## 3     820407 5933409
## 13490 660999 5581364
## 13491 616545 5566272
## 13492 625563 5568125

Delete coordinate columns from dataframe

stammdaten_join_startdate <- stammdaten_join_startdate %>% 
  dplyr::select(-one_of(colnames(coords)))

Show example entries

stammdaten_join_startdate %>% headtail()
##          obswell_id                        name start_date   value
## 1       BB_25470023               Ottenhagen OP 2000-11-20   78.96
## 2       BB_25470024               Ottenhagen UP 2000-12-18   78.78
## 3       BB_25480025                 Neumannshof 2000-11-20   34.93
## 13490 TH_5633900114       Hy Heinersdorf 1_2004 2007-01-08 429.972
## 13491 TH_5729240555 Hy Schweickershausen 1_2002 2007-01-08   348.9
## 13492 TH_5730000077          Br.Lindenau (0006) 2007-01-08  281.26

3.3 Conversion to Spatial Data

Now we create a SpatialPointsdataframe from our stammdaten dataframe

obswell_sp <- SpatialPointsDataFrame(coords = coords,
                                     data = stammdaten_join_startdate,
                                     proj4string = CRS('+proj=utm +zone=32 +ellps=GRS80 +units=m +no_defs'))

The coodinate reference system needs to be transformed to the one leaflet is expecting

obswell_sp <- obswell_sp %>% spTransform(CRS('+proj=longlat +ellps=WGS84 +datum=WGS84 +no_defs'))

4 Visualization

We create a color palette from the start dates

pal <- colorNumeric(
  palette = "Spectral",
  domain = as.numeric(obswell_sp$start_date))

We create a leaflet html GIS

map1 <- obswell_sp %>% create_custom_html_leafletmap1()

Plot the leaflet GIS

map1

Save the leaflet widget

saveWidget(map1, file='map1.html')

5 Selection of Training Data

For the classification of the time series as class anthropogenic or class natural training data is needed The selection of the training data for these two classes is described in the following chapter

Water sources per federal state in Germany

Water sources per federal state in Germany, Source: Grundwasser in Deutschland - Bundesministerium für Umwelt, Naturschutz und Reaktorsicherheit (BMU), 2008




Withdrawal of water for irrigation grouped by groundwater and surface water

Withdrawal of water for irrigation grouped by groundwater and surface water, Source: Statistik in der Wasserversorgung in der Landwirtschaft 2002 - Statistisches Bundesamt, 2004

5.1 Method 1 - Corine Land Cover 2018

5.1.1 Data Preparation

Show legend as table

clc2018_legend %>%
  dplyr::select(-grid_code, -rgb) %>%
  kable() %>%
  kable_styling(bootstrap_options = "striped",
                full_width = TRUE,
                font_size = 9) %>%
  scroll_box(width = "100%", height = "400px")
clc_code label1 label2 label3
111 Artificial surfaces Urban fabric Continuous urban fabric
112 Artificial surfaces Urban fabric Discontinuous urban fabric
121 Artificial surfaces Industrial, commercial and transport units Industrial or commercial units
122 Artificial surfaces Industrial, commercial and transport units Road and rail networks and associated land
123 Artificial surfaces Industrial, commercial and transport units Port areas
124 Artificial surfaces Industrial, commercial and transport units Airports
131 Artificial surfaces Mine, dump and construction sites Mineral extraction sites
132 Artificial surfaces Mine, dump and construction sites Dump sites
133 Artificial surfaces Mine, dump and construction sites Construction sites
141 Artificial surfaces Artificial, non-agricultural vegetated areas Green urban areas
142 Artificial surfaces Artificial, non-agricultural vegetated areas Sport and leisure facilities
211 Agricultural areas Arable land Non-irrigated arable land
212 Agricultural areas Arable land Permanently irrigated land
213 Agricultural areas Arable land Rice fields
221 Agricultural areas Permanent crops Vineyards
222 Agricultural areas Permanent crops Fruit trees and berry plantations
223 Agricultural areas Permanent crops Olive groves
231 Agricultural areas Pastures Pastures
241 Agricultural areas Heterogeneous agricultural areas Annual crops associated with permanent crops
242 Agricultural areas Heterogeneous agricultural areas Complex cultivation patterns
243 Agricultural areas Heterogeneous agricultural areas Land principally occupied by agriculture, with significant areas of natural vegetation
244 Agricultural areas Heterogeneous agricultural areas Agro-forestry areas
311 Forest and semi natural areas Forests Broad-leaved forest
312 Forest and semi natural areas Forests Coniferous forest
313 Forest and semi natural areas Forests Mixed forest
321 Forest and semi natural areas Scrub and/or herbaceous vegetation associations Natural grasslands
322 Forest and semi natural areas Scrub and/or herbaceous vegetation associations Moors and heathland
323 Forest and semi natural areas Scrub and/or herbaceous vegetation associations Sclerophyllous vegetation
324 Forest and semi natural areas Scrub and/or herbaceous vegetation associations Transitional woodland-shrub
331 Forest and semi natural areas Open spaces with little or no vegetation Beaches, dunes, sands
332 Forest and semi natural areas Open spaces with little or no vegetation Bare rocks
333 Forest and semi natural areas Open spaces with little or no vegetation Sparsely vegetated areas
334 Forest and semi natural areas Open spaces with little or no vegetation Burnt areas
335 Forest and semi natural areas Open spaces with little or no vegetation Glaciers and perpetual snow
411 Wetlands Inland wetlands Inland marshes
412 Wetlands Inland wetlands Peat bogs
421 Wetlands Maritime wetlands Salt marshes
422 Wetlands Maritime wetlands Salines
423 Wetlands Maritime wetlands Intertidal flats
511 Water bodies Inland waters Water courses
512 Water bodies Inland waters Water bodies
521 Water bodies Marine waters Coastal lagoons
522 Water bodies Marine waters Estuaries
523 Water bodies Marine waters Sea and ocean
999 NODATA NODATA NODATA
990 UNCLASSIFIED UNCLASSIFIED LAND SURFACE UNCLASSIFIED LAND SURFACE
995 UNCLASSIFIED UNCLASSIFIED WATER BODIES UNCLASSIFIED WATER BODIES
990 UNCLASSIFIED UNCLASSIFIED UNCLASSIFIED

Transform CRS to planar projection

# clc2018_data <- clc2018_data %>% spTransform(CRS('+proj=utm +zone=32 +ellps=WGS84 +datum=WGS84 +units=m +no_defs'))
obswell_sp <- obswell_sp %>% 
  spTransform(CRS('+proj=utm +zone=32 +ellps=WGS84 +datum=WGS84 +units=m +no_defs'))

Filter out polygons that describe artificial surfaces(built-up, airports, industrial)

clc2018_data_anthropogenic <- clc2018_data %>% 
  dplyr::filter(code_18 <=142)

First time: Find distance to nearest polygon #’ +eval=FALSE

# nearest_neighbor_clc <- nngeo::st_nn(
#   sf::st_as_sf(obswell_sp),
#   sf::st_as_sf(clc2018_data_anthropogenic),
#   k = 1,
#   returnDist = TRUE
# )

Extract results and convert to dataframe

# nearest_neighbor_clc_df <- tibble(obswell_id = obswell_sp$obswell_id,
#                                   index = unlist(nearest_neighbor_clc$nn),
#                                   distance = nearest_neighbor_clc$dist[,1])

Write dataframe to csv

# nearest_neighbor_clc_df %>% write_csv2('./processed_data/nearest_neighbor_clc.csv')

Second time: Import results

nearest_neighbor_clc_df <- read_csv2('./processed_data/nearest_neighbor_clc.csv')

Plot distribution of distances between observation wells and nearest artificial surface

nearest_neighbor_clc_df %>% plot_histogram_custom1()

Backtransformation of CRS

clc2018_data <- clc2018_data %>% spTransform(CRS('+proj=longlat +ellps=WGS84 +datum=WGS84 +no_defs'))
obswell_sp <- obswell_sp %>% spTransform(CRS('+proj=longlat +ellps=WGS84 +datum=WGS84 +no_defs'))

Plot the hydrogeological units

# ggplot() +
#   ggspatial::geom_spatial_polygon(
#     data = ggspatial::df_spatial(hydr_unit),
#     aes(x, y),
#     fill = NA,
#     colour = "black"
#   )
hydr_unit %>% raster::plot(col = RColorBrewer::brewer.pal(10, 'Paired'))

Intersect observation wells with hydrogeological units

obswell_sp <-
  obswell_sp %>% dplyr::mutate(gr_name = as_vector(sp::over(
    obswell_sp, dplyr::select(hydr_unit, gr_name)
  )))

5.1.2 Class Antropogenic

For the selection of observation wells that are antropogenically influenced, we choose observation wells that are locate within an artificial surface(built-up, airports, industrial). This means distance == 0. Join dataframes

nearest_neighbor_clc_df <-
  nearest_neighbor_clc_df %>% left_join(as_tibble(obswell_sp), by = 'obswell_id') %>% 
  dplyr::select(-name, -lon, -lat, -start_date, -value)

Choose maximum distance of observation well from artificial surfaces max_distance <- 0 means, that the observation wells are locate within artificial surfaces

max_distance <- 0

Plot distribution by hydrogeological units

nearest_neighbor_clc_df %>% 
  dplyr::filter(distance == max_distance) %>% 
  plot_histogram_custom2() +
  theme(axis.title.x=element_blank())

nearest_neighbor_clc_df$gr_name %>% base::unique()
##  [1] Nord- und mitteldeutsches Lockergesteinsgebiet                
##  [2] Oberrheingraben mit Mainzer Becken und nordhessischem Tertiaer
##  [3] Suedwestdeutsches Grundgebirge                                
##  [4] West- und sueddeutsches Schichtstufen- und Bruchschollenland  
##  [5] Alpenvorland                                                  
##  [6] Alpen                                                         
##  [7] Suedostdeutsches Grundgebirge                                 
##  [8] Mitteldeutsches Bruchschollenland                             
##  [9] West- und mitteldeutsches Grundgebirge                        
## [10] Rheinisch-Westfaelisches Tiefland                             
## 10 Levels: Alpen Alpenvorland ... West- und sueddeutsches Schichtstufen- und Bruchschollenland

Definition of sample size per hydrogeological unit

def_min_size <- nearest_neighbor_clc_df %>% 
  dplyr::filter(distance == max_distance) %>%  
  dplyr::filter(gr_name == 'Suedwestdeutsches Grundgebirge') %>% 
  base::nrow()

Sample training data

set.seed(123)

anthropogenic_obswell_m1 <- nearest_neighbor_clc_df %>% 
  dplyr::filter(distance == max_distance) %>% 
  dplyr::filter(gr_name != 'Alpen') %>%
  group_by(gr_name) %>% 
  sample_n(size = def_min_size) %>% 
  ungroup() %>% 
  bind_rows(dplyr::filter(nearest_neighbor_clc_df, gr_name == 'Alpen' & distance == max_distance))

Plot sampled distribution

anthropogenic_obswell_m1 %>% 
  plot_histogram_custom2()

Select training data from obswells_sp

anthropogenic_obswell_m1 <- lax(obswell_sp %>% dplyr::filter(.$obswell_id %in% anthropogenic_obswell_m1$obswell_id))

Write as csv

anthropogenic_obswell_m1 %>% 
  as_tibble() %>% 
  write_csv2('./processed_data/training_data_anthropogenic_obswell_m1.csv')

5.1.3 Class Natural

Choose minimum distance of observation well from artificial surfaces

min_distance <- 3000

Plot distribution by hydrogeological units

nearest_neighbor_clc_df %>% 
  dplyr::filter(distance >= min_distance) %>% 
  plot_histogram_custom2()

Sample training data

set.seed(123)

natural_obswell_m1 <- nearest_neighbor_clc_df %>% 
  dplyr::filter(distance >= min_distance) %>% 
  sample_n(size = base::nrow(anthropogenic_obswell_m1))

Plot sampled distribution

natural_obswell_m1 %>% 
  plot_histogram_custom2()

Select training data from obswells_sp

natural_obswell_m1 <- lax(obswell_sp %>% dplyr::filter(.$obswell_id %in% natural_obswell_m1$obswell_id))

Write as csv

natural_obswell_m1 %>% 
  as_tibble() %>% 
  write_csv2('./processed_data/training_data_natural_obswell_m1.csv')

5.1.4 Visualization of Training Data

map5 <- map1 %>% create_custom_html_leafletmap4()

Plot the leaflet GIS

map5

Save the leaflet GIS as html

saveWidget(map5, file='map5.html')

5.2 Method 2 - Water Production Sites

5.2.1 Data Preparation

CRS Transformation

public_water_prod_sp <- public_water_prod_sp %>%
  spTransform(CRS('+proj=longlat +ellps=WGS84 +datum=WGS84 +no_defs'))

mining_water_prod_sp <- mining_water_prod_sp %>%
  spTransform(CRS('+proj=longlat +ellps=WGS84 +datum=WGS84 +no_defs'))

powerplant_water_prod_sp <- powerplant_water_prod_sp %>%
  spTransform(CRS('+proj=longlat +ellps=WGS84 +datum=WGS84 +no_defs'))

reservoir_sp <- reservoir_sp %>%
  spTransform(CRS('+proj=longlat +ellps=WGS84 +datum=WGS84 +no_defs'))

Adding Source

public_water_prod_sp@data <- public_water_prod_sp@data %>%
  as_tibble() %>%
  dplyr::mutate(source = 'public water production')

mining_water_prod_sp@data <- mining_water_prod_sp@data %>%
  as_tibble() %>%
  dplyr::mutate(source = 'mining water production')

powerplant_water_prod_sp@data <- powerplant_water_prod_sp@data %>%
  as_tibble() %>%
  dplyr::mutate(source = 'powerplant water production')

reservoir_sp@data <- reservoir_sp@data %>%
  as_tibble() %>%
  dplyr::mutate(source = 'reservoir')

Specific Manipulation Filter out only types of water production that affect groundwater directly (Grundwasser, Mehrfachentnahme)

mining_water_prod_sp <- mining_water_prod_sp %>% 
  dplyr::filter(Art_Gewinn == 'Grundwasser' | Art_Gewinn == 'Mehrfachentnahme')

public_water_prod_sp <- public_water_prod_sp %>% 
  dplyr::filter(Art_Gewinn == 'Grundwasser' | Art_Gewinn == 'Mehrfachentnahme')

powerplant_water_prod_sp <- powerplant_water_prod_sp %>% 
  dplyr::filter(Art_Gewinn == 'Grundwasser' | Art_Gewinn == 'Mehrfachentnahme')

Join Points

water_production_join_sp <- raster::union(mining_water_prod_sp, public_water_prod_sp)
water_production_join_sp <- raster::union(water_production_join_sp, powerplant_water_prod_sp)

Add unique IDs

water_production_join_sp <- water_production_join_sp %>% 
  dplyr::mutate(id_unique = paste0('id_', seq(1,length.out = base::nrow(water_production_join_sp))))

Plot in Map

pal2 <- colorFactor(palette = 'Set1', domain = base::as.factor(water_production_join_sp$source) %>% base::unique())
labels <- c('powerplant water production', 'mining water production', 'public water production')

Create a leaflet GIS

map2 <- map1 %>% create_custom_html_leafletmap2()

Plot the leaflet GIS

map2

Save the leaflet widget

saveWidget(map2, file='map2.html')

Find nearest neighbors

obswell_sp_coords <- obswell_sp %>% 
  spTransform(CRS('+proj=utm +zone=32 +ellps=WGS84 +datum=WGS84 +units=m +no_defs')) %>% 
  coordinates() %>% 
  as_tibble()

water_production_join_sp_coords <- water_production_join_sp %>% 
  spTransform(CRS('+proj=utm +zone=32 +ellps=WGS84 +datum=WGS84 +units=m +no_defs'))  %>% 
  coordinates() %>% 
  as_tibble()

temp_result <- water_production_join_sp_coords %>% 
  nn2(obswell_sp_coords, k = 1)

nearest_neighbors <- tibble(index = temp_result$nn.idx[,1],
                            distance = temp_result$nn.dists[,1])

obswell_sp <- obswell_sp %>% 
  dplyr::mutate(nn_index = nearest_neighbors$index,
                nn_distance = nearest_neighbors$distance)

Plot the distribution of the distance

nearest_neighbors %>% plot_histogram_custom3()

5.2.2 Class Antropogenic

For the selection of observation wells that are antropogenically influenced, we choose wells from cities that use groundwater for drinking purposes or have shallow groundwater table. A shallow groundwater table causes a groundwater abstraction for constructions leading to an anthropogenically influenced groundwater table.

anthropogenic_obswell_m2 <- obswell_sp %>% 
  dplyr::filter(nn_distance < 1000)

anthropogenic_obswell_m2 %>% 
  as_tibble() %>% 
  write_csv2('./processed_data/training_data_anthropogenic_obswell_m2.csv')

Plot distribution of the distance of antropogenic training data

anthropogenic_obswell_m2@data %>% ggplot() + 
  geom_histogram(
    aes(nn_distance),
    binwidth = 100,
    col = 'black',
    fill = 'red',
    alpha = .5
  )

5.2.3 Class Natural

natural_obswell_m2 <- obswell_sp %>% 
  dplyr::arrange(nn_distance) %>% 
  dplyr::slice(utils::tail(row_number(), 
                           base::nrow(anthropogenic_obswell_m2)))

natural_obswell_m2 %>% 
  as_tibble() %>% 
  write_csv2('./processed_data/training_data_natural_obswell_m2.csv')

Plot distribution of the distance of natural training data

natural_obswell_m2@data %>% ggplot() + 
  geom_histogram(
    aes(nn_distance),
    binwidth = 1000,
    col = 'black',
    fill = 'red',
    alpha = .5
  )

5.2.4 Visualization of Training Data

map5 <- map1 %>% create_custom_html_leafletmap5()

Plot the leaflet GIS

map5

Save the leaflet GIS as html

saveWidget(map5, file='map5.html')

6 Further Analysis

Add comparison between water prod sites and artificial surfaces !!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!

# water_production_join_sp
# clc2018_data_anthropogenic
# 
# leaflet(width = "100%") %>%
#   addTiles(group = "OSM (default)") %>% 
#   addProviderTiles('Hydda.Base', group = "Hydda.Base") %>%
#   addCircleMarkers(
#     data = water_production_join_sp,
#     radius = 2.5,
#     fillColor = ~pal2(base::as.factor(water_production_join_sp$source)),
#     stroke = FALSE,
#     weight = 1,
#     fillOpacity = 1,
#     group = 'Water Production'
#   ) %>% 
#   addLegend("bottomleft",
#             pal = pal2,
#             values = base::as.factor(water_production_join_sp$source),
#             title = "Legend",
#             labFormat = function(type, cuts, p) {  # Here's the trick
#               paste0(labels)
#             },
#             opacity = 1,
#             group = 'Water Production'
#   ) %>%
#   addPolygons(data = clc2018_data_anthropogenic %>% spTransform(CRS('+proj=longlat +ellps=WGS84 +datum=WGS84 +no_defs')),
#               stroke = FALSE, 
#               fillOpacity = 0.5,
#               group = 'Artificial Surfaces') %>% 
#   # addLegend("bottomleft",
#   #           title = "Legend",
#   #           opacity = 1,
#   #           group = 'Artificial Surfaces'
#   # ) %>%
#   addLayersControl(
#     baseGroups = c("OSM (default)", 'Hydda.Base'),
#     overlayGroups = c('Water Production', "Artificial Surfaces"),
#     options = layersControlOptions(collapsed = FALSE)
#   )
# 
# nearest_neighbor_clc <- nngeo::st_nn(
#   sf::st_as_sf(obswell_sp),
#   sf::st_as_sf(clc2018_data_anthropogenic),
#   k = 1,
#   returnDist = TRUE
# )

Wasserwerke in Deutschland - NRW: https://www.elwasweb.nrw.de/elwas-web/index.jsf#